Here, I analyse and document my progress with the analysis of a world-wide whole genome sampling of Zymoseptoria tritici. Some of the analysis are just exploratory while some other are lined up in a clear path to answering questions related to the history of Z.tritici and to better understand its adaptation to various climates.

library(knitr)
library(reticulate)

#Spatial analyses packages
library(raster)
library(sp)
library(rgdal)
library(maps)
library(geosphere)

#Data wrangling and data viz
library(tidyverse)
library(purrr)
library(RColorBrewer)
library(plotly)
library(cowplot)
library(GGally)
library(corrplot)
library(pheatmap)
library(ggstance)
library('pophelper')
library(ggbiplot)
library(igraph)
library(ggraph)
library(ggtext)
library("UpSetR")
library(scatterpie)


#Pop structure and pop genomic
library(GenomicFeatures)
library(gridExtra)

library(ggtree)
library(tidytree)
library(multcomp)
library(lsmeans)



#GEA
library(lfmm)
library(GWASTools)
library(topGO)

#Statistics
library(car)
library(corrr)
library(lsmeans)
library(multcomp)


#Variables
world <- map_data("world")
project_dir="/Users/feurtey/Documents/Postdoc_Bruce/Projects/WW_project/"
lists_dir = "/Users/feurtey/Documents/Postdoc_Bruce/Projects/WW_project/WW_PopGen/Keep_lists_samples/"

#Data directories
data_dir=paste0(project_dir, "0_Data/")
metadata_dir=paste0(project_dir, "Metadata/")
fig_dir = "/Users/feurtey/Documents/Postdoc_Bruce/Manuscripts/Feurtey_WW_Zt/Draft_figures/"

#Analysis directories
#-___________________
VAR_dir = paste0(project_dir, "1_Variant_calling/")
  depth_per_window_dir = paste0(VAR_dir, "1_Depth_per_window/")
  vcf_dir = paste0(VAR_dir, "4_Joint_calling/")
  mito_SV = paste0(VAR_dir, "6_Mito_SV/")
PopStr_dir = paste0(project_dir, "2_Population_structure/")
  nuc_PS_dir=paste0(PopStr_dir, "0_Nuclear_genome/")
  mito_PS_dir = paste0(PopStr_dir, "1_Mitochondrial_genome/")
Sumstats_dir = paste0(project_dir, "3_Sumstats_demography/")
TE_RIP_dir=paste0(project_dir, "4_TE_RIP/")
   RIP_DIR=paste0(TE_RIP_dir, "0_RIP_estimation/")
   DIM2_DIR=paste0(TE_RIP_dir, "1_Blast_from_denovo_assemblies/")
GEA_dir=paste0(project_dir, "5_GEA/")
fung_dir=paste0(project_dir, "6_Fungicide_resistance/")
virulence_dir = paste0(project_dir, "7_Virulence/")
sel_dir = paste0(project_dir, "8_Selection/")
  gene_list_dir = paste0(sel_dir, "0_Lists_unique_copy/")

#Files
vcf_name="Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.biall_SNP.max-m-1.maf-0.05.thin-1000bp"
vcf_name_nomaf="Ztritici_global_March2021.filtered-clean.AB_filtered.SNP.max-m-0.8.thin-1000bp"
vcf_name_mito = "Ztritici_global_March2021.genotyped.mt.filtered.clean.AB_filtered.variants.good_samples.max-m-80"
Zt_list = paste0(lists_dir, "Ztritici_global_March2021.genotyped.good_samples.args")
gff_file = paste0(data_dir, "Zymoseptoria_tritici.MG2.Grandaubert2015.no_CDS.gff3")
effectors_annotation_file = "/Users/feurtey/Documents/Postdoc_Eva/Manuscripts/Accepted/Alice_Cecile_Comparative_genomics/Data_for_publication/Annotations_2018_genomes_for_publication.tab"
eggnog_annotation = paste0(data_dir, "Zymoseptoria_tritici.MG2.Grandaubert2015.eggnog")
ref_fasta_file = paste0(data_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa")
metadata_name = "Main_table_from_SQL_Feb_2020"
gene_annotation = read_tsv(paste0(data_dir, "Badet_GLOBAL_PANGENOME_TABLE.txt"))
complete_mito = read_tsv(paste0(data_dir, "Complete_mitochondria_from_blast.txt"), col_names = c("ID_file", "Contig"))

Sys.setenv(PROJECTDIR=project_dir)
Sys.setenv(VARDIR=VAR_dir)
Sys.setenv(VCFDIR=vcf_dir)
Sys.setenv(POPSTR=PopStr_dir)
Sys.setenv(MITOPOPSTR=mito_PS_dir)

Sys.setenv(SUMST=Sumstats_dir)
Sys.setenv(GEADIR=GEA_dir)

Sys.setenv(ZTLIST=Zt_list)
Sys.setenv(GFFFILE = gff_file)
Sys.setenv(REFFILE = ref_fasta_file)
Sys.setenv(VCFNAME=vcf_name)
Sys.setenv(VCFNAME_NOMAF=vcf_name_nomaf)
Sys.setenv(VCFNAME_MITO=vcf_name_mito)

#knitr::opts_chunk$set(echo = F)
knitr::opts_chunk$set(message = F)
knitr::opts_chunk$set(warning = F)
knitr::opts_chunk$set(results = T)


# Metadata and sample lists
##Filtered_samples
filtered_samples = bind_rows(
  read_tsv(paste0(metadata_dir, "Sample_removed_based_on_IBS.args"), col_names = "ID_file") %>%
  mutate(Filter = "IBS"),
read_tsv(paste0(metadata_dir, "Sample_with_too_much_NA.args"), col_names = "ID_file") %>%
  mutate(Filter = "High_NA"),
read_tsv(paste0(metadata_dir, "Samples_to_filter_out.args"), col_names = "ID_file") %>%
  mutate(Filter = "Mutants_etc"))

##Samples in vcf
genotyped_samples = read_tsv(Zt_list, col_names = "ID_file")

## Metadata of genotyped samples 
temp = read_tsv(paste0(metadata_dir, metadata_name, "_Description.tab"), col_names = F) %>% pull()

Zt_meta = read_tsv(paste0(metadata_dir, metadata_name, "_with_collection.tab"), 
                 col_names = temp,
                 na = "\\N", guess_max = 2000) %>%
  unite(Coordinates, Latitude, Longitude, sep = ";", remove = F) %>%
  inner_join(., genotyped_samples)  %>%
  mutate(Country = ifelse(Country == "USA", paste(Country, Region, sep = "_"), Country)) %>%
  mutate(Country = ifelse(Country == "Australia", paste(Country, Region, sep = "_"), Country)) %>%
  mutate(Country = ifelse(Country == "NZ", "New Zealand", Country)) %>%
  mutate(Country = ifelse(Country == "CH", "Switzerland", Country)) %>%
  mutate(Latitude2 = round(Latitude, 2), Longitude2 = round(Longitude, 2))

#genotyped_samples %>%
#  filter(!(ID_file %in% filtered_samples$ID_file)) %>%
#    write_tsv(Zt_list, col_names = F)



#Define colors
## For continents
#myColors <- c("#04078B", "#a10208", "#FFBA08", "#CC0044", "#5C9D06", "#129EBA","#305D1B")
myColors <- c("#DA4167", "grey", "#ffba0a", "#A20106", "#3F6E0C", "#129eba", "#8fa253" )
names(myColors) = levels(factor(Zt_meta$Continent))
Color_Continent = ggplot2::scale_colour_manual(name = "Continent", values = myColors)
Fill_Continent = ggplot2::scale_fill_manual(name = "Continent", values = myColors)

## For clustering
K_colors = c("#f9c74f", "#f9844a", "#90be6d", "#f5cac3", 
"#83c5be", "#f28482", "#577590", "#e5e5e5", "#a09abc",  "#52796f",
"#219ebc", "#003049", "#82c0cc", "#283618", "white")

## For correlations
mycolorsCorrel<- colorRampPalette(c("#0f8b8d", "white", "#a8201a"))(20)

#Random gradients
greens=c("#1B512D", "#669046", "#8CB053", "#B1CF5F", "#514F59")
blues=c("#08386E", "#1C89C9", "#28A7C0", "#B0DFE8", "grey")

#Bioclim data (from Worldclim)
Bioclim_var = c("Annual Mean Temperature", "Mean Diurnal Range ",
                "Isothermality (BIO2/BIO7) (×100)", "Temperature Seasonality (standard deviation ×100)",
                "Max Temperature of Warmest Month", "Min Temperature of Coldest Month",
                "Temperature Annual Range (BIO5-BIO6)",
                "Mean Temperature of Wettest Quarter","Mean Temperature of Driest Quarter",
                "Mean Temperature of Warmest Quarter", "Mean Temperature of Coldest Quarter",
                "Annual Precipitation", "Precipitation of Wettest Month",
                "Precipitation of Driest Month", "Precipitation Seasonality (Coefficient of Variation)",
                "Precipitation of Wettest Quarter", "Precipitation of Driest Quarter",
                "Precipitation of Warmest Quarter","Precipitation of Coldest Quarter")

pheno_cat = tibble(Phenotype = c(
                     "Max Temperature of Warmest Month", "Mean Temperature of Warmest Quarter",
                     "Min Temperature of Coldest Month", "Mean Temperature of Coldest Quarter",
                     "Precipitation of Driest Month", "Precipitation of Driest Quarter",
                     "Precipitation of Wettest Month", "Precipitation of Wettest Quarter"),
       Category = c("Warmth", "Warmth", "Cold", "Cold",
                    "Drought", "Drought", "Rain", "Rain"))
pheno_table = tibble(BIO_ID = paste0("BIO", c(1:19)),
                     Phenotype = Bioclim_var,
                     Category = c("Temperature", "Temperature range", "Temperature range", "Temperature range",
                                  "Temperature", "Temperature", "Temperature range", "Temperature/rain",
                                  "Temperature/rain", "Temperature", "Temperature", "Precipitation",
                                  "Precipitation", "Precipitation", "Precipitation range", "Precipitation",
                                  "Precipitation", "Temperature/rain", "Temperature/rain"))


#Bioclim_var[c(2, 3, 7)] = "Ranges bioclimatic variables"
#Bioclim_var[c(4, 15)] = "Seasonality bioclimatic variables"
#Bioclim_var[c(1, 12) = "Annual rain and temperature"
#Bioclim_var[c(8, 9, 17, 18)] = "Mix rain and temperature" 

GEA: genotype-environment association


Geographic/environmental data

I will use the sampling site of each isolate (as precisely as I can manage) to approximate environmental parameters such as temperature, or precipitation. One possibility to find such data is this website, https://www.worldclim.org/data/worldclim21.html (published in Fick and Hijmans, 2017), which gives access to climate data for 1970-2018. These can be transformed into bioclimatic variables using the biovars function from the R package dismo (https://rdrr.io/cran/dismo/man/biovars.html).

temp = Zt_meta %>%
  #filter(!(ID_file %in% filtered_samples$ID_file)) %>%
  filter(!is.na(Longitude)) %>%
  mutate(X = as.numeric(unlist(Longitude)),
                            Y = as.numeric(unlist(Latitude))) %>%
  dplyr::select(X, Y) %>%
  distinct()

sp = SpatialPoints(temp[, c("X", "Y")])
summary(sp)
## Object of class SpatialPoints
## Coordinates:
##         min      max
## X -122.9300 175.6576
## Y  -46.2187  59.3294
## Is projected: NA 
## proj4string : [NA]
## Number of points: 292
bio_list = list()
for (i in c(1:length(Bioclim_var))) {
  file_name=paste0(data_dir,"Climatic_data/wc2.1_10m_bio/wc2.1_10m_bio_", i, ".tif")
  rast_temp = raster(file_name)    
  bio_list[[Bioclim_var[i]]] = raster::extract(rast_temp, sp)
}

climate_per_coordinates = cbind(temp, do.call(cbind, bio_list))
dat = Zt_meta %>%
  #filter(!(ID_file %in% filtered_samples$ID_file)) %>% 
  dplyr::select(ID_file, Latitude, Longitude, Continent) %>%
  full_join(., climate_per_coordinates,
                by= c("Longitude" = "X", "Latitude" = "Y")) %>%
  dplyr::select(-ID_file) %>%
  gather(key = "Bioclim_var", value = "Estimate", -c(Longitude, Latitude, Continent))

#Define stable colors
#myColors = c("#ec9a29", "#0f8b8d", "#143642")
#names(myColors) = levels(factor(dat$Continent))
#colScale = scale_colour_manual(name = "Continent", values = myColors)
#colScaleFill = scale_fill_manual(name = "Continent", values = myColors)

ggplot(dat, aes(Estimate, fill =Continent, color =Continent)) +
  geom_histogram(position = "stack") +
  facet_wrap(.~Bioclim_var, scales = "free") +
  theme_classic() + Color_Continent + Fill_Continent

Naturally, many of the variables investigated above are highly correlated. It is intuitive for example that the minimum temperature of the coldest month would be correlated to the average temperature of the coldest quarter! Here, I visualize these correlations.

#Correlogram

Ccor = cor(climate_per_coordinates[, c(3:ncol(climate_per_coordinates))])
corrplot(Ccor, type="lower", order="hclust",
         col = mycolorsCorrel,
         tl.col="black", tl.srt=45, tl.cex = 0.7)

#Simple heatmap
heatmap(x = Ccor, col = mycolorsCorrel, symm = TRUE)

Ccor[abs(Ccor) <= 0.75] <-0
heatmap(x = Ccor, col = mycolorsCorrel, symm = TRUE)

tidy_cors <- climate_per_coordinates %>%
  dplyr::select(everything(), -contains("uarter"), -Y, -X) %>%
  correlate() %>%
  stretch() 

graph_cors <- tidy_cors %>%
  #filter(abs(r) > .2) %>%
  graph_from_data_frame(directed = FALSE)

ggraph(graph_cors) +
  geom_edge_link(aes(edge_alpha = .5, color = r)) +
  guides(edge_alpha = "none", edge_width = "none") +
  scale_edge_colour_gradientn(limits = c(-1, 1), colors = c("#0f8b8d", "white", "#a8201a")) +
  #geom_node_point(color = "black", size = 3) +
  geom_node_text(aes(label = name), size = 2) +
  theme_graph() +
  labs(title = "Correlations between bioclimatic variables")

#
tidy_cors <- climate_per_coordinates %>%
  dplyr::select( -Y, -X) %>%
  correlate() %>%
  stretch() 

graph_cors <- tidy_cors %>%
  #filter(abs(r) > .75) %>%
  mutate(r = ifelse(abs(r) <= 0.75, 0, r)) %>%
  graph_from_data_frame(directed = FALSE)

ggraph(graph_cors) +
  geom_edge_link(aes(edge_alpha = .5, color = r)) +
  guides(edge_alpha = "none", edge_width = "none") +
  scale_edge_colour_gradientn(limits = c(-1, 1), colors = c("#0f8b8d", "white", "#a8201a")) +
  #geom_node_point(color = "black", size = 3) +
  geom_node_text(aes(label = name), size = 2) +
  theme_graph() +
  labs(title = "Correlations between bioclimatic variables")

Prepare and run GEA

#Create standardized values for the environment variables

climate_per_coord_standard = climate_per_coordinates %>%
  pivot_longer(cols = -c(X, Y), names_to = "Variable_climate", values_to = "Value") %>%
  group_by(Variable_climate) %>%
  mutate(Standard_value = scale(Value)) %>%
  dplyr::select(-Value) %>%
  pivot_wider(names_from = Variable_climate, values_from = Standard_value)
#The fam file should be same for all core chromosomes, hopefully. So I only have to create one to get the file format including the phenotypes. This can then be used for all chromosomes.

temp2 = left_join(read_tsv(Zt_list, col_names = "ID_file") %>% mutate(ID2 = ID_file), 
                            Zt_meta %>% dplyr::select(ID_file, Latitude, Longitude)) %>%
  bind_cols(., tibble(X1 = rep(0, nrow(Zt_meta)), X2 = rep(0, nrow(Zt_meta)), X3 = rep(0, nrow(Zt_meta)))) %>%
  left_join(., climate_per_coord_standard,
                   by = c("Latitude" = "Y", "Longitude" = "X")) 

dplyr::select(temp2, -Latitude, -Longitude) %>%
 write_delim(., 
              "~/Documents/Postdoc_Bruce/Projects/WW_project/5_GEA/Ztmeta.fam", 
               delim = " ", col_names = F)

left_join(read_tsv(Zt_list, col_names = "ID_file") %>% mutate(ID2 = ID_file), 
                            Zt_meta %>% dplyr::select(ID_file, Latitude, Longitude)) %>%
  bind_cols(., tibble(X1 = rep(0, nrow(Zt_meta)), X2 = rep(0, nrow(Zt_meta)), X3 = rep(0, nrow(Zt_meta)))) %>%
  left_join(., climate_per_coordinates,
                   by = c("Latitude" = "Y", "Longitude" = "X")) %>%
  dplyr::select(-Latitude, -Longitude) %>%
 write_delim(., 
              "~/Documents/Postdoc_Bruce/Projects/WW_project/5_GEA/Ztmeta_non_standard.fam", 
               delim = " ", col_names = F)
#Transfer on the cluster
#rsync -avP ~/Documents/Postdoc_Bruce/Projects/WW_project/5_GEA/Ztmeta.fam alice@130.125.25.244:/data2/alice/WW_project/5_GEA/Phenotypes_with_climatic_variables.fam

#To run gemma on the cluster: conda activate env0
#Commands are then in the format gemma -h 

Identify variants significantly associated to bioclimatic variables

First, I read the results that I got from running GEMMA with a LOCO strategy (i.e., leave-one-chromosome-out).

results_GEMMA = list()
for (i in c(1:length(Bioclim_var))) {
  temp = list.files(path = GEA_dir, pattern = paste0("*_for_phenotype_", i,".assoc.txt")) %>% 
    map_df(~read_tsv(file = paste0(GEA_dir, .), col_types = c("dcddccdddddd"))) %>%
    unite(col = SNP, chr, ps, sep = "_", remove = F) %>%
    dplyr::select(SNP, CHR = chr, BP = ps, P = p_wald, zscore = logl_H1) %>%
    mutate(Phenotype = Bioclim_var[i], Nb = i)
  results_GEMMA[[Bioclim_var[i]]] = temp
}

results_GEMMA = bind_rows(results_GEMMA)


core_temperature_variable = c("Max Temperature of Warmest Month", "Min Temperature of Coldest Month")

core_rain_variable = c("Precipitation of Wettest Month", "Precipitation of Driest Month")
#Genomic Inflation Factor
kable(results_GEMMA %>% 
        group_by(Phenotype) %>%
        summarize(Genomic_Inflation_Factor = median(qchisq(1-P,1))/qchisq(0.5,1)) %>%
        arrange(Genomic_Inflation_Factor))
Phenotype Genomic_Inflation_Factor
Precipitation of Warmest Quarter 1.061468
Mean Temperature of Wettest Quarter 1.065765
Precipitation of Driest Month 1.080538
Precipitation Seasonality (Coefficient of Variation) 1.086267
Mean Temperature of Driest Quarter 1.086815
Mean Diurnal Range 1.089768
Isothermality (BIO2/BIO7) (×100) 1.101098
Max Temperature of Warmest Month 1.106971
Annual Precipitation 1.109024
Precipitation of Coldest Quarter 1.109791
Precipitation of Driest Quarter 1.110470
Mean Temperature of Warmest Quarter 1.113279
Temperature Seasonality (standard deviation ×100) 1.127099
Min Temperature of Coldest Month 1.127293
Mean Temperature of Coldest Quarter 1.130714
Precipitation of Wettest Quarter 1.131847
Temperature Annual Range (BIO5-BIO6) 1.142874
Precipitation of Wettest Month 1.146354
Annual Mean Temperature 1.149803
#QQ-plots
par(mfrow=c(3,3)) 
for (i in c(1:length(Bioclim_var))){
temp = results_GEMMA %>% filter(Phenotype == Bioclim_var[i]) %>% dplyr::sample_frac(size = .10) %>% dplyr::select(P) %>% pull()
GWASTools::qqPlot(temp, thinThreshold = 2)
}

To determine which variants are significant, we use the Bonferroni correction method, i.e. we divide the traditional threshold of significance of 0.05 by the number of variants that we tested. Based on this corrected threshold, we can then identify variants significantly associated with each environmental condition that we tested.

head(results_GEMMA) 
## # A tibble: 6 x 7
##   SNP        CHR     BP      P zscore Phenotype                  Nb
##   <chr>    <dbl>  <dbl>  <dbl>  <dbl> <chr>                   <int>
## 1 1_110857     1 110857 0.612   -981. Annual Mean Temperature     1
## 2 1_111115     1 111115 0.0460  -979. Annual Mean Temperature     1
## 3 1_111124     1 111124 0.594   -980. Annual Mean Temperature     1
## 4 1_111127     1 111127 0.599   -980. Annual Mean Temperature     1
## 5 1_111168     1 111168 0.555   -980. Annual Mean Temperature     1
## 6 1_111283     1 111283 0.794   -981. Annual Mean Temperature     1
Bonferroni_threshold = results_GEMMA %>% dplyr::count(Nb) %>%
  mutate(Bonferroni_threshold = 0.05/n) %>% summarize(average = mean(Bonferroni_threshold)) %>% pull()

kable(results_GEMMA %>% 
  filter(P <= Bonferroni_threshold) %>% 
  dplyr::count(Phenotype) %>% 
    arrange(n))
Phenotype n
Precipitation of Driest Month 36
Precipitation of Driest Quarter 38
Mean Temperature of Driest Quarter 41
Mean Temperature of Wettest Quarter 58
Mean Temperature of Warmest Quarter 60
Precipitation of Wettest Month 66
Mean Diurnal Range 81
Annual Precipitation 82
Max Temperature of Warmest Month 85
Precipitation of Wettest Quarter 98
Precipitation of Warmest Quarter 136
Temperature Seasonality (standard deviation ×100) 181
Precipitation of Coldest Quarter 194
Temperature Annual Range (BIO5-BIO6) 253
Annual Mean Temperature 263
Mean Temperature of Coldest Quarter 344
Isothermality (BIO2/BIO7) (×100) 353
Precipitation Seasonality (Coefficient of Variation) 399
Min Temperature of Coldest Month 536
significant = results_GEMMA %>%
  filter(P <= Bonferroni_threshold)  %>% group_by(SNP, CHR, BP) %>%
  mutate(Count = n()) %>% mutate(SNP_type = "significant")%>% 
  ungroup() 

#temp = significant %>% ungroup() %>% pivot_wider(values_fill = 1, values_from = P, names_from = Phenotype) %>%
#  dplyr::select(-CHR, -BP, - zscore, -Nb)

significant %>%
  dplyr::select(CHR, BP) %>% 
  distinct() %>%
  write_delim(paste0(GEA_dir, "Significant_GEMMA.tsv"), delim = "\t", col_names = F)

Identifying the variants which are significant is very useful already, but it is hard to work with so many “independent” positions. It also does not make sense conceptually, as nearby variants are probably picking up the exact same signal through linkage. So, here I gather variants together into “significant loci” if they are no bigger gaps than a certain threshold.

#Distance between two significant SNPs to include them in the same region
len_threshold = 20000L

#For each phenotype and for each chromosome

list_loci = list()
for (j in c(1:length(Bioclim_var))) {
  temp2 = list()
  for (i in c(1:13)){
    #Pull vector of position for the right chromosome and phenotype
    v = filter(significant, CHR == i) %>% 
      filter(Phenotype == Bioclim_var[j]) %>%
      dplyr::select(CHR, BP) %>% 
      distinct() %>% pull(BP) %>% sort()
    
    # Split the vectors into group based on the length threshold
    temp = split(v, cumsum(c(TRUE, diff(v) >= len_threshold)))
    
    #Single value gets the column name "value" instead of a number
    if (length(temp) == 1) {
      temp_tibble = as.tibble(sapply(temp, '[', seq(max(sapply(temp, length)))))
      colnames(temp_tibble) <- 1
    } else {
      temp_tibble = as.tibble(sapply(temp, '[', seq(max(sapply(temp, length)))))
    }
    
    # Transform to store relevant info in tidy format
    temp2[[i]] = temp_tibble %>% 
      pivot_longer(cols =everything(), names_to = "Locus_nb", values_to = "BP") %>%
      filter(complete.cases(.)) %>%
      mutate(CHR = i, Phenotype = Bioclim_var[j]) %>% 
      ungroup()
  }
list_loci[[j]] = bind_rows(temp2)
}

#Adding loci info to the table
significant = bind_rows(list_loci) %>%
  full_join(significant, .)

significant %>% dplyr::select(Locus_nb, Phenotype) %>%
  distinct() %>% 
  dplyr::count(Phenotype)
## # A tibble: 19 x 2
##    Phenotype                                                  n
##    <chr>                                                  <int>
##  1 "Annual Mean Temperature"                                 18
##  2 "Annual Precipitation"                                     7
##  3 "Isothermality (BIO2/BIO7) (×100)"                        26
##  4 "Max Temperature of Warmest Month"                        11
##  5 "Mean Diurnal Range "                                      7
##  6 "Mean Temperature of Coldest Quarter"                     18
##  7 "Mean Temperature of Driest Quarter"                       6
##  8 "Mean Temperature of Warmest Quarter"                      7
##  9 "Mean Temperature of Wettest Quarter"                      5
## 10 "Min Temperature of Coldest Month"                        17
## 11 "Precipitation of Coldest Quarter"                        12
## 12 "Precipitation of Driest Month"                            3
## 13 "Precipitation of Driest Quarter"                          3
## 14 "Precipitation of Warmest Quarter"                        11
## 15 "Precipitation of Wettest Month"                          10
## 16 "Precipitation of Wettest Quarter"                         7
## 17 "Precipitation Seasonality (Coefficient of Variation)"    23
## 18 "Temperature Annual Range (BIO5-BIO6)"                    15
## 19 "Temperature Seasonality (standard deviation ×100)"       17

To identify potentially more important variants amongst the significantly associated variants, we can predict the effect of the mutation on the protein sequence of genes. For this, I use SNPeff.

#rsync -avP ../5_GEA/Significant_GEMMA.tsv alice@130.125.25.244:/data2/alice/WW_project/5_GEA/Significant_GEMMA.tsv

#../Software/vcftools_jydu/src/cpp/vcftools --gzvcf /data2/alice/WW_project/1_Variant_calling/4_Joint_calling/Ztritici_global_March2021.genotyped.ALL.filtered.clean.AB_filtered.variants.good_samples.max-m-80.ann.vcf.gz --positions /data2/alice/WW_project/5_GEA/Significant_GEMMA.tsv --out /data2/alice/WW_project/5_GEA/Significant_GEMMA --recode --recode-INFO-all

#../Software/bcftools-1.10.2/bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\t%INFO/ANN\n' /data2/alice/WW_project/5_GEA/Significant_GEMMA.vcf > /data2/alice/WW_project/5_GEA/Significant_GEMMA.ann.tsv

#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/5_GEA/Significant_GEMMA.ann.tsv ../5_GEA/

#grep "#CHROM" /data2/alice/WW_project/5_GEA/Significant_GEMMA.vcf | cut -f 1,2,4,5,10- | sed 's/#//' > /data2/alice/WW_project/5_GEA/Significant_GEMMA.ann.tab
#../Software/bcftools-1.10.2/bcftools query -f '%CHROM\t%POS\t%REF\t%ALT[\t%GT]\n' /data2/alice/WW_project/5_GEA/Significant_GEMMA.vcf >> /data2/alice/WW_project/5_GEA/Significant_GEMMA.ann.tab
#rsync -avP alice@130.125.25.244:/data2/alice/WW_project/5_GEA/Significant_GEMMA.ann.tab ../5_GEA/
temp = read_tsv(paste0(GEA_dir, "Significant_GEMMA.ann.tsv"), col_names = c("CHR", "BP", "REF", "ALT", "ANN")) %>%
  separate(ANN, sep = ",", 
           into = c("ANN1", "ANN2", "ANN3", "ANN4", "ANN5", "ANN6", "ANN7", "ANN8", "ANN9", "ANN10", "ANN11",
                    "ANN12", "ANN13", "ANN14", "ANN15", "ANN16", "ANN17"), 
           extra = "warn")

signif_ann = temp %>%
  pivot_longer(cols = c(-CHR, -BP, -REF, -ALT), names_to = "ANN", values_to = "Annotation") %>%
  filter(!is.na(Annotation)) %>%
  separate(Annotation, into = c("ALT2", "Variant_type", "Effect_strength", "Gene", "Rest"), sep = "\\|", extra = "merge") %>%
  filter(!str_detect(Gene, "-")) %>%
  separate(Gene, into = c("temp", "Chrom", "Gene_nb"), remove = FALSE)  %>%
  dplyr::select(-ANN, -ALT2, -temp) %>%
  mutate(Gene_nb = as.numeric(Gene_nb)) %>%
  left_join(., read_tsv(effectors_annotation_file) %>% 
              filter(Sample == "Zt09") %>% 
              mutate(Gene = str_replace(Protein_ID, "_chr", "")))
  

signif_ann %>% dplyr::count(CHR, Gene_nb, Effect_strength) %>%
  pivot_wider(names_from = Effect_strength, values_from = n, values_fill = 0)
## # A tibble: 1,435 x 6
##      CHR Gene_nb MODIFIER MODERATE   LOW  HIGH
##    <dbl>   <dbl>    <int>    <int> <int> <int>
##  1     1     229        1        0     0     0
##  2     1     230        1        0     0     0
##  3     1     231        1        0     0     0
##  4     1     232        0        1     0     0
##  5     1     233        1        0     0     0
##  6     1     320        4        0     0     0
##  7     1     321        4        0     0     0
##  8     1     322        4        0     0     0
##  9     1     323        3        1     0     0
## 10     1     324        4        0     0     0
## # … with 1,425 more rows
pheno = "Max Temperature of Warmest Month"

temp = results_GEMMA %>% filter(Phenotype == pheno) %>%
  left_join(., significant %>% dplyr::filter(Phenotype == pheno) ) %>%
  mutate(SNP_type = ifelse(is.na(SNP_type), "Other", SNP_type)) %>%
  mutate(logP = -log10(P)) %>%
  left_join(., signif_ann) %>%
  filter(SNP_type == "significant")

Compare results between phenotypes

We have several variables that could be interpreted the same way. For example, we expect the amount of rain during the driest month and the driest quarter to give us similar information related to drought tolerance. It could be useful to identify useful SNPs to find the ones that are significantly associated to all variables of the same type. Let’s see if the p-values seem correlated between variables of the same types and what amount of significant variants are shared.

reshape_GEMMA_results <- function(phenotype_list) {
  temp = results_GEMMA %>%
    filter(Phenotype %in% phenotype_list) %>%
    mutate(logP = -log10(P)) %>%
    filter(logP > 2) %>%
    mutate(Phenotype_code = ifelse(Phenotype == phenotype_list[1], "Pheno1", "Pheno2"))%>%
    dplyr::select(SNP, Phenotype_code, logP)
  temp = temp %>% pivot_wider(values_from = logP, names_from = Phenotype_code) %>%
    mutate(Signif_max = ifelse(Pheno1 >= -log10(Bonferroni_threshold), "1", "0"),
           Signif_mean = ifelse(Pheno2 >= -log10(Bonferroni_threshold), "1", "0")) %>%
    unite(Significant, Signif_max, Signif_mean, sep = "", remove = F) 
  
  temp[complete.cases(temp),] 
}

dot_plot_comparison <- function(reshaped_GEMMA_results, phenotype_list) {
  ggplot(temp, aes(x = Pheno1, y = Pheno2, col = Significant)) +
    geom_point() +
    geom_hline(aes(yintercept=-log10(Bonferroni_threshold)),
                 linetype = "dashed", color = "grey20")+
    geom_vline(aes(xintercept=-log10(Bonferroni_threshold)),
                 linetype = "dashed", color = "grey20") +
    theme_classic() +
    labs(x = phenotype_list[1], y = phenotype_list[1])
}



# Variables related to temperatures
# _________________________
warm_temp_var = c("Max Temperature of Warmest Month", "Mean Temperature of Warmest Quarter")
cold_temp_var = c("Min Temperature of Coldest Month", "Mean Temperature of Coldest Quarter")

#Significant for warm temperatures
temp = reshape_GEMMA_results(warm_temp_var)
table(temp[,"Significant"])
## 
##    00    01    10    11 
## 15613    30    55    30
p1 = dot_plot_comparison(temp, warm_temp_var)


#Significant for cold temperatures
temp = reshape_GEMMA_results(cold_temp_var)
table(temp[,"Significant"])
## 
##    00    01    10    11 
## 19057    29   221   315
p2 = dot_plot_comparison(temp, cold_temp_var)

cowplot::plot_grid(p1 +theme(legend.position = "none"), p2  + theme(legend.position = "none"))

# Variables related to rain
# _________________________
drought_var = c("Precipitation of Driest Month", "Precipitation of Driest Quarter")
rainy_var = c("Precipitation of Wettest Month", "Precipitation of Wettest Quarter")
core_rain_variable
## [1] "Precipitation of Wettest Month" "Precipitation of Driest Month"
#Significant for drought
temp = reshape_GEMMA_results(drought_var)
table(temp[,"Significant"])
## 
##    00    01    10    11 
## 14814     6     4    32
p1 = dot_plot_comparison(temp, drought_var)


#Significant for high precipitation
temp = reshape_GEMMA_results(rainy_var)
table(temp[,"Significant"])
## 
##    00    01    10    11 
## 16088    29     6    60
p2 = dot_plot_comparison(temp, rainy_var)

cowplot::plot_grid(p1 +theme(legend.position = "none"), p2  + theme(legend.position = "none"))

Instead of comparing the results based on a priori knowledge of the underlying wheather/climate patterns for each variables, it is possible tosimply look at the correlation between all pairs of variables. This yields a matrix that is similar to the one shown in the previous section but this time it takes into account the p-values in each test and not the values of the bioclimatic variables themselves.

# Keep only variants significant in at least one association test
temp = results_GEMMA %>%
  ungroup() %>%
  dplyr::select(-CHR, -BP, -Nb, -zscore) %>%
  distinct() %>%
  pivot_wider(names_from = Phenotype, values_from = P) %>%
  dplyr::select(-SNP) %>%
  filter(rowSums(. <= Bonferroni_threshold) >= 1)

#Correlation
Ccor = cor(temp)

#Correlogram
corrplot.mixed(Ccor, upper = "ellipse",
         #col = mycolorsCorrel,
         tl.col="black", tl.cex = 0.5, number.cex = .5)

#Simple heatmap
heatmap(x = Ccor, col = mycolorsCorrel, symm = TRUE)

It is also possible that some genes or some variants could be associated to different phenotypes. Here, we can use upset plots to visualise different comparisons:

  • How many genes are found associated between the different phenotypes tested? The genes are identified through the SnpEff pipeline.
  • How many variants are significantly associated to several phenotypes tested?
  • Amongst the variants which are found to be significant to all phenotypes belonging to the same category, how many are found to be common between phenotypes?
  • Amongst the variants which are found to be significant in at least one of the phenotype belonging to the same category, how many are found to be common between phenotypes?
# Per gene and per phenotype
gene_detected = inner_join(pheno_cat, significant) %>%
  mutate(logP = -log10(P)) %>%
  left_join(., signif_ann) %>% 
  mutate(Present = 1) %>%
  ungroup() %>%
  dplyr::select(Phenotype, Gene, Present) %>%
  distinct() %>%
  filter(!is.na(Gene)) %>%
  pivot_wider(names_from = Phenotype, values_from = Present, values_fill = 0)

for_upset = as.data.frame(gene_detected[,-1])
rownames(for_upset) = pull(gene_detected[,1])

upset(for_upset, sets = names(for_upset), 
      order.by = "freq", mb.ratio = c(0.5, 0.5), 
      mainbar.y.label = "Gene numbers")

#Per variant and per phenotype
temp = inner_join(pheno_cat, significant) 

temp2 = mutate(temp, Present = 1) %>%
  ungroup() %>%
  dplyr::select(Phenotype, SNP, Present) %>%
  distinct() %>%
  pivot_wider(names_from = Phenotype, values_from = Present, values_fill = 0)
  
for_upset = as.data.frame(temp2[,-1])
rownames(for_upset) = pull(temp2[,1])
upset(for_upset, sets = names(for_upset), 
      order.by = "freq", mb.ratio = c(0.5, 0.5),
      mainbar.y.label = "Variant numbers")

#Category: found in both
n_count = temp %>% group_by(Category) %>%
  dplyr::count(SNP)

signif_in_pairs = inner_join(temp, n_count %>% filter(n == 2))  %>%
  mutate(Present = ifelse(n == 2, 1, 0)) %>%
  dplyr::select(Category, SNP, Present) %>%
  distinct() %>%
  pivot_wider(names_from = Category, values_from = Present, values_fill = 0)

for_upset = as.data.frame(signif_in_pairs[,-1])
rownames(for_upset) = pull(signif_in_pairs[,1])

upset(for_upset, sets = names(for_upset), order.by = "freq", empty.intersections = "on" ,
      mainbar.y.label = "Variant numbers found in all phenotypes")

#Per category: found in at least one
signif_in_pairs = inner_join(temp, n_count) %>% 
  mutate(Present = ifelse(n == 2, 1, 1)) %>%
  dplyr::select(Category, SNP, Present) %>%
  distinct() %>%
  pivot_wider(names_from = Category, values_from = Present, values_fill = 0)

for_upset = as.data.frame(signif_in_pairs[,-1])
rownames(for_upset) = pull(signif_in_pairs[,1])

upset(for_upset, sets = names(for_upset), 
      order.by = "freq", empty.intersections = "on", 
      mainbar.y.label = "Variant numbers found in at least one phenotype" )

library(topGO)

pheno = "Temperature"
gene_detected = inner_join(pheno_table, significant) %>%
  mutate(logP = -log10(P)) %>%
  left_join(., signif_ann) %>%
  filter(Effect_strength %in% c("HIGH", "MODERATE"))

geneNames = read_tsv(gff_file, 
               col_names = c("CHR", "X1", "mRNA", "Start", "Stop", "X2", "X3", "X4", "Annot"))  %>%
  filter(mRNA == "mRNA") %>%
  separate(col = Annot, into = c("ID", "Parent", "Name"), sep = ";") %>%
  mutate(Name = str_remove(Name, "Name=")) %>%
  pull(Name)

myInterestingGenes <- gene_detected %>%
  filter(Category == pheno) %>% 
  pull(Gene) %>% unique()
geneList <- factor(as.integer(geneNames %in% myInterestingGenes))
names(geneList) <- geneNames
str(geneList)
##  Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "names")= chr [1:11839] "Zt09_5_00296" "Zt09_5_00590" "Zt09_5_00072" "Zt09_5_00316" ...
geneID2GO = readMappings(paste0(data_dir, "GO_Zt09.map"))

GOdata <- new("topGOdata", ontology = "MF", allGenes = geneList,
 annot = annFUN.gene2GO, gene2GO = geneID2GO)

resultFisher <- runTest(GOdata, algorithm = "classic", statistic = "fisher")
resultKS <- runTest(GOdata, algorithm = "classic", statistic = "ks")
resultKS.elim <- runTest(GOdata, algorithm = "elim", statistic = "ks")

allRes <- GenTable(GOdata, classicFisher = resultFisher,
                   classicKS = resultKS, elimKS = resultKS.elim,
                   orderBy = "elimKS", ranksOf = "classicFisher", topNodes = 10)
allRes
##         GO.ID                                        Term Annotated Significant
## 1  GO:0016409               palmitoyltransferase activity        10           0
## 2  GO:0003727                 single-stranded RNA binding        19           0
## 3  GO:0019904             protein domain specific binding        14           0
## 4  GO:0070063                      RNA polymerase binding        20           0
## 5  GO:0003746      translation elongation factor activity         7           0
## 6  GO:0001181 RNA polymerase I general transcription i...         3           0
## 7  GO:0051010                microtubule plus-end binding        10           0
## 8  GO:0015081 sodium ion transmembrane transporter act...         7           0
## 9  GO:0070181        small ribosomal subunit rRNA binding         6           0
## 10 GO:0043175          RNA polymerase core enzyme binding        19           0
##    Expected Rank in classicFisher classicFisher classicKS elimKS
## 1      0.13                   251             1    0.0041 0.0041
## 2      0.25                   252             1    0.0051 0.0051
## 3      0.18                   253             1    0.0070 0.0070
## 4      0.26                   254             1    0.0078 0.0078
## 5      0.09                   255             1    0.0085 0.0085
## 6      0.04                   256             1    0.0089 0.0089
## 7      0.13                   257             1    0.0117 0.0117
## 8      0.09                   258             1    0.0124 0.0124
## 9      0.08                   259             1    0.0135 0.0135
## 10     0.25                   260             1    0.0150 0.0150
showSigOfNodes(GOdata, score(resultKS.elim), firstSigNodes = 5, useInfo = 'all')

## $dag
## A graphNEL graph with directed edges
## Number of Nodes = 18 
## Number of Edges = 18 
## 
## $complete.dag
## [1] "A graph with 18 nodes."

Manhattan plots

Traditionally, GWAS results are represented as Manhattan plot. Let’s create some for the main variables we are interested in.

# Compute chromosome size and cumulative position of each chromosome
results_GEMMA_for_plot <- results_GEMMA %>% 
  group_by(CHR) %>% 
  dplyr::summarise(chr_len=max(BP)) %>% 
  mutate(tot=cumsum(chr_len)-chr_len) %>%
  dplyr::select(-chr_len) 

#EStimate middle of chromosome to center the labels
axisdf = results_GEMMA_for_plot %>%
  inner_join( ., results_GEMMA %>% filter(Nb == 1)) %>%
  arrange(CHR, BP) %>%
  mutate(BPcum = BP+tot) %>% 
  group_by(CHR) %>% 
  dplyr::summarise(center=( max(BPcum) + min(BPcum) ) / 2 )


# Manhattan plot function
Manhattan_one_pheno <- function(pheno, color_duo) {
  
  temp = results_GEMMA %>% filter(Phenotype == pheno) %>%
    inner_join(., results_GEMMA_for_plot)  %>%
    left_join(., significant %>% dplyr::filter(Phenotype == pheno) ) %>%
    mutate(SNP_type = ifelse(is.na(SNP_type), "Other", SNP_type)) %>%
    mutate(logP = -log10(P)) %>%
    filter(CHR <= 13) %>%
    filter(logP > 2) 
  
  ggplot(temp, aes(x=BP, y=logP)) +
    geom_point( aes(color=SNP_type), alpha = 0.8, size=1.3) +
    scale_color_manual(values = color_duo) +
    geom_hline(aes(yintercept=-log10(Bonferroni_threshold)),
               linetype = "dashed", color = "grey20") +
    theme_bw() +
    theme( legend.position="none", panel.border = element_blank(),
      panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(),
      axis.title.x=element_blank(), axis.text.x=element_blank(),
      axis.ticks.x=element_blank(),
      strip.background = element_rect(colour="white", fill="white"),
      plot.margin = unit(c(0, 0, 0, 0), "cm")) +
      facet_grid(cols = vars(CHR), scales = "free_x") 
}

# Manhattan plot function 2
Manhattan_faceted_pheno <- function(pheno_list, color_duo) {
  results_GEMMA %>% filter(Phenotype %in% pheno_list) %>%
    inner_join(., results_GEMMA_for_plot)  %>%
    arrange(CHR, BP) %>%
    mutate(BPcum = BP+tot) %>%
    mutate(logP = -log10(P))%>%
    filter(logP > 2) %>%
    ggplot(., aes(x=BPcum, y=logP)) +
      geom_point( aes(color=as.factor(CHR)), alpha=0.8, size=1.3) +
      scale_color_manual(values = rep(color_duo, 22 )) +
      scale_x_continuous( label = axisdf$CHR, breaks= axisdf$center ) +
      geom_hline(aes(yintercept=-log10(Bonferroni_threshold)),
                 linetype = "dashed", color = "grey20") +
      theme_bw() +
      theme(legend.position="none",
        panel.border = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank()) +
      facet_grid(rows = vars(Phenotype), scales = "free_y") 
}


# Top SNPs
top_SNPs_density <- function(pheno, min_length_locus, custom_colors){
  temp = significant %>% 
    filter(Phenotype == pheno) %>% 
    group_by(CHR, Locus_nb) %>%
    mutate(Locus_length = 1 + max(BP) - min(BP)) %>%
    filter(Locus_length >= min_length_locus) %>%
    mutate(Rank = rank(P, ties.method = "first")) %>%
    arrange(Rank) %>%
    ungroup() %>%
    filter(Rank == 1) %>%
    dplyr::select(SNP, CHR, BP)
  
  relevant_climate = climate_per_coordinates %>%
    pivot_longer(-c(X,Y), names_to = "phenotype", values_to = "Bioclim") %>%
    filter(phenotype == pheno) %>%
    left_join(., Zt_meta %>% dplyr::select(ID_file, Latitude, Longitude), 
              by = c("X" = "Longitude", "Y" = "Latitude"))
  
  temp = read_tsv(paste0(GEA_dir, "Significant_GEMMA.ann.tab"), na = ".") %>%
    inner_join(temp, by = c("CHROM" = "CHR", "POS" = "BP")) %>%
    pivot_longer(-c(CHROM, POS, REF, ALT, SNP), 
                 names_to = "ID_file", values_to = "Allele") %>%
    inner_join(., relevant_climate) %>%
    filter(Allele != "NA")
  
  temp %>%
    ggplot(aes(Bioclim, fill = as.factor(Allele), col = as.factor(Allele))) +
    geom_density(alpha = .4) + 
    facet_wrap(vars(CHROM, POS), scales = "free") +
    theme_bw() +
    scale_color_manual(values = custom_colors) +
    scale_fill_manual(values = custom_colors)
  
  #temp %>% dplyr::count(SNP, Allele) %>%
  #  pivot_wider(names_from = Allele, values_from = n)

}


# Gene/TE tracks plot
plot_gff <- function(gff_name, chromosome, min_coord, max_coord, genes_to_highlight, col = "black") {
  gff = read_tsv(gff_name, 
               col_names = c("CHR", "X1", "mRNA", "Start", "Stop", "X2", "X3", "X4", "Annot"))  %>%
  separate(col = Annot, into = c("ID", "Parent", "Name"), sep = ";") %>%
  mutate(ID = str_remove(ID, "ID=")) %>% 
  filter(CHR == chromosome) %>%
  filter(Start >= min_coord) %>%
  filter(Stop <= max_coord) %>%
  mutate(y_value = rank(Start)) %>%
  arrange(Start)

## If too many genes, organize them
if (nrow(gff) > 5) {
  temp = ceiling(nrow(gff)/5)
  gff = gff %>% mutate(y_value = rep(c(1:5), temp)[1:nrow(gff)])
}

## Make plot
c = gff %>%
  ggplot() +
  geom_segment(mapping = aes(x = Start, xend = Stop, y = y_value, yend = y_value), size = 2, col = col) +
  theme_bw() + 
  theme(axis.title.y = element_blank(), axis.text.y = element_blank(), 
        axis.title.x = element_blank(), axis.ticks.y = element_blank(), 
        panel.grid.minor.y = element_blank(), panel.grid.major.y = element_blank()) +
  coord_cartesian(xlim = c(min_coord, max_coord), ylim = c(min(gff$y_value) - 0.5, max(gff$y_value) + 0.5))

## Highlight interesting genes
temp = match(genes_to_highlight, gff$ID)
if ( sum(!is.na(temp)) > 0 ){
  c = c + geom_segment(data = filter(gff, ID == genes_to_highlight),
                   mapping = aes(x = Start, xend = Stop, y = y_value, yend = y_value), 
                   size = 3, col = "blue")
}
## If there are a reasonable number of genes, 
# I will also add their name on the plot
if (nrow(gff) <= 15) {
  c = c + geom_text(aes(x = Stop + (max_coord - min_coord)*0.05 , y = y_value, label = ID), size = 2) 
}

c
}

I want to have a main plot for temperature and for precipitation. First, I choose the maximum and minimum temperature.

# Manhattan plot with both max and min temperature with mirrored effect
p = Manhattan_one_pheno("Mean Temperature of Warmest Quarter", c("#F5B9B3", "#F28482")) +
  labs(y = str_wrap("Mean Temperature of Warmest Quarter (-logP)", 25))

q = Manhattan_one_pheno("Min Temperature of Coldest Month", c("#C4E7FD", "#0889D9")) +
  labs(y = str_wrap("Min Temperature of Coldest Month (-logP)", 25)) +
  scale_y_reverse() + theme(strip.text.x = element_blank())

cowplot::plot_grid(p, q, nrow = 2)

ggsave(paste0(fig_dir, "GEA_Manhattan_temperature_main.pdf"), width = 18, height = 10, units = "cm")

top_SNPs_density("Mean Temperature of Warmest Quarter", 2000, c("#F5B9B3", "#F28482", "grey"))

top_SNPs_density("Min Temperature of Coldest Month", 2000, c("#C4E7FD", "#0889D9", "grey"))

Then, for precipitation, I choose the precipitation of the driest and of the wettest month.

#  Manhattan plot for core precipitation
p = Manhattan_one_pheno("Precipitation of Driest Month", c("#FFD399", "#ff9f1c")) +
  labs(y = str_wrap("Precipitation of Driest Month (-logP)", 25))

q = Manhattan_one_pheno("Precipitation of Wettest Month", c("#cbf3f0", "#2ec4b6")) +
  scale_y_reverse() +
  labs(y = str_wrap("Precipitation of Wettest Month (-logP)", 25))

cowplot::plot_grid(p, q, nrow = 2)

ggsave(paste0(fig_dir, "GEA_Manhattan_rain_main.pdf"), width = 18, height = 10, units = "cm")

top_SNPs_density("Precipitation of Driest Month", 2000, c("#FFD399", "#ff9f1c", "grey"))

top_SNPs_density("Precipitation of Wettest Month", 2000, c("#cbf3f0", "#2ec4b6", "grey"))

I also want to have a Manhattan plot for all possible environmental variable. I do try to classify them, grouping conceptually similar variables together.

#Yearly variables
Manhattan_faceted_pheno(Bioclim_var[c(2, 3, 7)], c("grey", "skyblue")) +
  labs(title = "Ranges bioclimatic variables")

Manhattan_faceted_pheno(Bioclim_var[c(4, 15)], c("grey", "skyblue")) +
  labs(title = "Seasonality bioclimatic variables")

Manhattan_faceted_pheno(Bioclim_var[c(1, 12)], c("grey", "skyblue")) +
  labs(title = "Annual rain and temperature")

#Temperatures
Manhattan_faceted_pheno(warm_temp_var, c("grey", "skyblue")) +
  labs(title = "High temperatures")

Manhattan_faceted_pheno(cold_temp_var, c("grey", "skyblue")) +
  labs(title = "Cold temperatures")

#Rain
Manhattan_faceted_pheno(drought_var, c("grey", "skyblue")) +
  labs(title = "Drought")

Manhattan_faceted_pheno(rainy_var, c("grey", "skyblue")) +
  labs(title = "Rain")

#Mix rain and temperature
Manhattan_faceted_pheno(Bioclim_var[c(8, 9, 17, 18)], c("grey", "skyblue")) +
  labs(title = "Mix rain and temperature")

i=10
#SNP_name = "3_444010"
#SNP_name = "7_1449518"
SNP_name = "10_460207"
temp2 = raster(paste0(data_dir,"Climatic_data/wc2.1_10m_bio/wc2.1_10m_bio_", i, ".tif"))
temp = as.data.frame(rasterToPoints(temp2, xy = TRUE))
colnames(temp) <- c("X", "Y", "Bioclim_var")

map1 = ggplot() +
  geom_raster(data = temp , 
              aes(x = X, y = Y, 
                  fill = Bioclim_var)) + 
  theme_void() +
  scale_fill_viridis("inferno") +
  theme(panel.border  = element_rect(fill=NA, colour = "grey",size = 0.5, linetype = 1))
#panel.background = element_rect(fill = "aliceblue"))
p1 = map1 + coord_cartesian(xlim=c(-20, 60), ylim=c(20, 70))
p2 = map1 + coord_cartesian(xlim=c(-160, -40), ylim=c(-80, 80))
p3 = map1 + coord_cartesian(xlim=c(105, 175), ylim=c(-65, 10))
aus_map = cowplot::plot_grid(p3 + theme(legend.position = "None"), get_legend(p1), ncol = 2, rel_widths = c(1.1, 1))
ligne = cowplot::plot_grid(p1 + theme(legend.position = "None"), aus_map, ncol = 1,  rel_heights = c(1, 0.7))
cowplot::plot_grid(p2 + theme(legend.position = "None"), ligne, ncol = 2, rel_widths = c(0.7, 1)) 

ggsave(paste0(fig_dir, "Map_BIO", i, ".pdf"), width = 18, height = 10, units = "cm")

allele = significant %>% 
    filter(Phenotype == Bioclim_var[i]) %>% 
    filter(SNP == SNP_name) %>%
    dplyr::select(SNP, CHR, BP) %>%
    inner_join(., read_tsv(paste0(GEA_dir, "Significant_GEMMA.ann.tab"), na = "."), by = c("CHR" = "CHROM", "BP" = "POS")) %>%
    pivot_longer(-c(CHR, BP, REF, ALT, SNP), 
                 names_to = "ID_file", values_to = "Allele") %>%
    inner_join(., Zt_meta %>% dplyr::select(ID_file, Latitude, Longitude)) %>%
    mutate(Latitude = round(Latitude/2)*2, Longitude = round(Longitude/2)*2)%>%
    mutate(Allele = ifelse(Allele == 0, "REF", ifelse(Allele == 1, "ALT1", "ALT2"))) %>%
    group_by(Allele, Longitude, Latitude) %>%
    dplyr::count() %>%
    pivot_wider(names_from = Allele, values_from = n, values_fill = 0) %>%
    filter(!is.na(Longitude)) %>%
    mutate(Radius = 2) 

#Add if column alt2
if ("ALT2" %in% colnames(allele)) {
  allele = allele
} else {
  allele = allele %>% mutate(ALT2 = 0)
}

p1 = ggplot() + coord_cartesian(xlim=c(-20, 60), ylim=c(20, 70)) + 
  geom_scatterpie(data = allele, mapping = aes(x=Longitude, y=Latitude, r = Radius), 
                  cols = c("REF", "ALT1", "ALT2"), color=NA, alpha=.8) +
  theme(panel.background = element_rect(fill = "transparent"), # bg of the panel
    plot.background = element_rect(fill = "transparent", color = NA), # bg of the plot
    panel.grid.major = element_blank(), # get rid of major grid
    panel.grid.minor = element_blank(), # get rid of minor grid
    legend.background = element_rect(fill = "transparent"), # get rid of legend bg
    legend.box.background = element_rect(fill = "transparent"))

p2 = ggplot() + coord_cartesian(xlim=c(-160, -40), ylim=c(-80, 80)) + 
  geom_scatterpie(data = allele, mapping = aes(x=Longitude, y=Latitude, r = Radius*2), 
                  cols = c("REF", "ALT1", "ALT2"), color=NA, alpha=.8)+
  theme(panel.background = element_rect(fill = "transparent"), # bg of the panel
    plot.background = element_rect(fill = "transparent", color = NA), # bg of the plot
    panel.grid.major = element_blank(), # get rid of major grid
    panel.grid.minor = element_blank(), # get rid of minor grid
    legend.background = element_rect(fill = "transparent"), # get rid of legend bg
    legend.box.background = element_rect(fill = "transparent"))

p3 = ggplot() + coord_cartesian(xlim=c(105, 175), ylim=c(-65, 10)) + 
  geom_scatterpie(data = allele, mapping = aes(x=Longitude, y=Latitude, r = Radius*2), 
                  cols = c("REF", "ALT1", "ALT2"), color=NA, alpha=.8)+
  theme(panel.background = element_rect(fill = "transparent"), # bg of the panel
    plot.background = element_rect(fill = "transparent", color = NA), # bg of the plot
    panel.grid.major = element_blank(), # get rid of major grid
    panel.grid.minor = element_blank(), # get rid of minor grid
    legend.background = element_rect(fill = "transparent"), # get rid of legend bg
    legend.box.background = element_rect(fill = "transparent"))

#Plotting all the maps together! 
aus_map = cowplot::plot_grid(p3 + theme(legend.position = "None"), get_legend(p1), ncol = 2, rel_widths = c(1.1, 1))
ligne = cowplot::plot_grid(p1 + theme(legend.position = "None"), aus_map, ncol = 1,  rel_heights = c(1, 0.7))
cowplot::plot_grid(p2 + theme(legend.position = "None"), ligne, ncol = 2, rel_widths = c(0.7, 1)) 

ggsave(paste0(fig_dir, "Allele_", SNP_name,"_BIO", i, ".pdf"), width = 18, height = 10, units = "cm")
#Set parameters for the plot
chromosome = 7
min_coord = 1094086
max_coord = 1892455
#pheno = "Precipitation of Driest Month"
pheno = "Precipitation of Warmest Quarter"
genes_to_highlight = c("Zt09_model_5_00004")
TE_gff = "/Users/feurtey/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Cecile_IPO323_refTEs_match.gff"
gene_gff = "/Users/feurtey/Documents/Postdoc_Bruce/Projects/WW_project/0_Data/Zymoseptoria_tritici.MG2.Grandaubert2015.mRNA.gff3"

# Prepare the gene section of the plot
c = plot_gff(TE_gff, chromosome, min_coord, max_coord, genes_to_highlight, "grey50")
d = plot_gff(gene_gff, chromosome, min_coord, max_coord, genes_to_highlight)

# Prepare the GWAS section of the plot for the first phenotype
b = results_GEMMA %>%
  filter(Phenotype == pheno) %>%
  filter(CHR == chromosome) %>%
  filter(BP >= min_coord) %>%
  filter(BP <= max_coord) %>%
  mutate(logP = -log10(P)) %>%
  ggplot() +
  geom_point(aes(x = BP, y = logP, col = logP)) +
    geom_hline(aes(yintercept=-log10(Bonferroni_threshold)),
               linetype = "dashed", color = "grey20") +
  theme_bw() +
  labs(title = paste0("GWAS: ", pheno),
       subtitle = paste0("The region is on the chromosome ", chromosome, 
                           " from ", min_coord, "bp to ", max_coord, "bp.")) +
  theme(axis.title.x = element_blank())

cowplot::plot_grid(b, c, d, ncol = 1, align = 'v', axis = 'lr', rel_heights = c(1, 0.3, 0.3))



Fungicide resistance


This is based on the same scripts made for Fanny Hartmann’s paper on the pairs of populations: looking for alleles already known to be involved in resistance to fungicide.

genotypes_resistance = read_tsv(paste0(fung_dir, "genotypes_tidy_format.tab"),
                                col_names = c("temp", "ID_file", "Allele")) %>%
  separate(temp, into = c("Gene", "AA_change"), sep = "\\.") %>%
   dplyr::mutate(AA_REF = str_sub(AA_change, 1, 1)) %>%
  dplyr::mutate(Allele_type = ifelse(AA_REF == Allele, "Origin", "Mutant")) %>%
  left_join(Zt_meta)


genotypes_resistance %>%
  dplyr::count(AA_change, Gene, Allele_type) %>%
  ggplot(aes(x=n, y=AA_change, fill = Allele_type)) +
  geom_barh(stat="identity") +
  facet_grid(Gene ~ ., scales = "free_y", space = "free_y")

I suspect that there will be an effect of both geography and time on the frequency of these alleles and so I try to represent this here.

#CYP51
temp = genotypes_resistance %>%
  filter(Gene == "CYP51") %>%
  dplyr::count(AA_change, Continent,Sampling_Date, Allele_type) %>%
  pivot_wider(names_from = Allele_type, values_from = n, values_fill = list(n = 0)) %>%
  mutate(prop_mutant = Mutant/(Mutant + Origin)) %>%
  filter(!is.na(Sampling_Date)) %>%
  filter(!is.na(Continent)) %>%
  group_by(AA_change) %>%
  dplyr::mutate(Somme = sum(Mutant)) %>%
  filter(Somme > 0)

temp %>%
  ggplot(aes(x=Sampling_Date, y=Continent, size = prop_mutant, fill = prop_mutant)) +
  geom_point(shape=21, alpha =0.7) + theme_bw() +
  facet_wrap(.~AA_change) + 
  scale_fill_gradient(low="white", high = "#16697a") +
  labs(title = str_wrap(paste("Mutant proportion per known mutation",
                              "in the gene CYP51"), width = 35),
       subtitle = str_wrap(paste("This gene can mutate and cause resistance to azole fungicides."), width = 65),
       fill = "Proportion of mutants",
       size = "Proportion of mutants",
       x = "Years", y = "")

#Beta_tubulin
genotypes_resistance %>%
  filter(Gene == "beta_tubulin") %>%
  dplyr::count(AA_change, Continent,Sampling_Date, Allele_type) %>%
  pivot_wider(names_from = Allele_type, values_from = n, values_fill = list(n = 0)) %>%
  mutate(Number_all = Mutant + Origin) %>%
  mutate(prop_mutant = Mutant/Number_all) %>%
  group_by(AA_change) %>%
  dplyr::mutate(Somme = sum(Mutant)) %>%
  filter(Somme > 0) %>%
  ggplot(aes(x=Sampling_Date, y=Continent, size = prop_mutant, fill = prop_mutant)) +
  geom_point(shape=21, alpha =0.7) + theme_bw() +
  facet_wrap(.~AA_change) + scale_fill_gradient(low="white", high = "#16697a")+
  labs(title = str_wrap(paste("Mutant proportion per known mutation",
                              "in the beta tubuline gene"), width = 35),
       subtitle = str_wrap(paste("This gene can mutate and cause resistance",
                                 " to benzimidazole fungicides."), width = 65),
       fill = "Proportion of mutants",
       size = "Proportion of mutants",
       x = "Years", y = "")

# Mitochondrial_cytb
genotypes_resistance %>%
  filter(Gene == "mitochondrial_cytb") %>%
  dplyr::count(AA_change, Continent,Sampling_Date, Allele_type) %>%
  pivot_wider(names_from = Allele_type, values_from = n, values_fill = list(n = 0)) %>%
  mutate(Number_all = Mutant + Origin) %>%
  mutate(prop_mutant = Mutant/Number_all) %>%
  group_by(AA_change) %>%
  dplyr::mutate(Somme = sum(Mutant)) %>%
  filter(Somme > 0) %>%
  ggplot(aes(x=Sampling_Date, y=Continent, size = prop_mutant, fill = prop_mutant)) +
  geom_point(shape=21, alpha =0.7) + theme_bw() +
  facet_wrap(.~AA_change) + scale_fill_gradient(low="white", high = "#16697a") +
  labs(title = str_wrap(paste("Mutant proportion per known mutation",
                              "in the mitochondrial gene cytb"), width = 35),
       subtitle = str_wrap(paste("This gene can mutate and cause resistance ",
                                 "to QoI, or Quinone outside inhibitors."), width = 65),
       fill = "Proportion of mutants",
       size = "Proportion of mutants",
       x = "Years", y = "")

# SDH genes
genotypes_resistance %>%
  filter(grepl("SDH", Gene)) %>%
  unite(Label, Gene, AA_change, remove = T) %>%
  dplyr::count(Label, Continent, Sampling_Date, Allele_type) %>%
  pivot_wider(names_from = Allele_type, values_from = n, values_fill = list(n = 0)) %>%
  mutate(Number_all = Mutant + Origin) %>%
  mutate(prop_mutant = Mutant/Number_all) %>%
  group_by(Label) %>%
  dplyr::mutate(Somme = sum(Mutant)) %>%
  filter(Somme > 0) %>%
  ggplot(aes(x=Sampling_Date, y=Continent, size = prop_mutant, fill = prop_mutant)) +
  geom_point(shape=21, alpha =0.7) + theme_bw() +
  facet_wrap(.~Label) + scale_fill_gradient(low="white", high = "#16697a")+
  labs(title = str_wrap(paste("Mutant proportion per known mutation",
                              "in one of the SDH genes"), width = 35),
       subtitle = str_wrap(paste("This gene can mutate and cause resistance to SDHI fungicides."), width = 65),
       fill = "Proportion of mutants",
       size = "Proportion of mutants",
       x = "Years", y = "")



Virulence to new varieties


Thierry Marcel’s virulence GWAS

I use the alleles identified in the GWAS of Thierry Marcel and track them in the world-wide populations.

virulence_table = read_tsv(paste0(virulence_dir, "French_GWAS_virulence_alleles.tab")) %>%
  separate(VIRULENCE_ALLELES, into = c("VIRULENCE_ALLELE1", "VIRULENCE_ALLELE2"), sep = ",")

french_GWAS_variants = read_tsv(paste0(virulence_dir, "Positions_GWAS.short.vcf"),
                                na = ".")  %>%
  pivot_longer(-c(`#CHROM`, POS, REF, ALT), names_to = "ID_file", values_to = "Allele_nb") %>%
  separate(ALT, into = c("ALT1", "ALT2"), sep = ",") %>%
  mutate(Allele_ID = ifelse(Allele_nb == 0, REF, ifelse(Allele_nb == 1, ALT1, ALT2))) %>%
  left_join(., virulence_table) %>%
  mutate(virulence = ifelse(Allele_ID == NON_VIRULENT_ALLELES, "Non_virulent",
                            ifelse(Allele_ID == VIRULENCE_ALLELE1 | Allele_ID == VIRULENCE_ALLELE2,
                                   "Virulent", NA)))

french_GWAS_variants = inner_join(french_GWAS_variants, Zt_meta)


french_GWAS_variants %>%
  tidyr::unite(position, `#CHROM`, POS) %>%
  dplyr::count(position, Continent,Sampling_Date, virulence) %>%
  pivot_wider(names_from = virulence, values_from = n, values_fill = list(n = 0)) %>%
  mutate(Number_all = Virulent + Non_virulent) %>%
  mutate(prop_virulent = Virulent/Number_all) %>%
  ggplot(aes(x=Sampling_Date, y=Continent, size = prop_virulent, fill = prop_virulent)) +
  geom_point(shape=21, alpha =0.7) + theme_bw() +
  scale_fill_gradient(low="white", high = "#16697a")+
  labs(title = str_wrap(paste("Virulence alleles",
                              ""), width = 35),
       subtitle = str_wrap(paste("These alleles were identified in a GWAS study",
                                 " to increase either virulence or aggressiveness",
                                 " on some wheat varieties."), width = 65),
       fill = "Proportion of virulent",
       size = "Proportion of virulent",
       x = "Years", y = "") +
  facet_wrap(vars(position))

Welll… Whatever…

Avr_Stb6

#On the cluster: run blast
while read sample ; do sh Detect_gene_blast.sh ./Directories_new.sh /data2/alice/WW_project/7_Virulence/Avr_Stb6.fasta ${sample} Illumina /data2/alice/WW_project/7_Virulence/${sample}_Avr_Stb6.blast.tab; done < list_Third_batch_Dec_2018.txt

#On the cluster: Gather blast results
cat /data2/alice/WW_project/7_Virulence/*Avr_Stb6.blast.tab > /data2/alice/WW_project/7_Virulence/Overall_results_blast_Avr_Stb6.txt

#On the cluster: Gather blast results
while read p ;
 do
sample=$(echo $p | cut -f 1 -d " " ) ;
 chr=$(echo $p | cut -f 4 -d " ") ;
 start=$(echo $p | cut -f 11 -d " ") ; end=$(echo $p | cut -f 12 -d " ") ;
 echo $sample, $chr, $start, $end ;
 order_start=$(echo -e "$start\n$end"  | sort -n | head -n1);
 order_end=$(echo -e "$start\n$end"  | sort -nr | head -n1);
 echo -e "${chr}\t${order_start}\t${order_end}" > temp.bed ;
 ~/Software/bedtools getfasta \
   -fi /data2/alice/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/0_Spades/${sample}.fasta\
   -bed temp.bed | \
   sed "s/>/>${sample}:/" >> \
 /data2/alice/WW_project/7_Virulence/Overall_results_blast_Avr_Stb6.fasta ;
done < /data2/alice/WW_project/7_Virulence/Overall_results_blast_Avr_Stb6.txt

#And on the website, because laziness
mafft --thread 8 --threadtb 5 --threadit 0 --reorder --adjustdirection --auto input > output

Here is the tree for the Avr_Stb6 gene. For reference, the allele carried by ST99CH_1A4 gives an avirulent phenotype, while the allele from ST99CH_1A5 is the virulent allele (from Zhong et al. 2017).

#grep ">" ../7_Virulence/Overall_results_blast_Avr_Stb6.fasta | sed 's/>//' > ../7_Virulence/Overall_results_blast_Avr_Stb6.list

temp2 = read_tsv(paste0(virulence_dir, "Overall_results_blast_Avr_Stb6.list"), col_names = "Fasta_header") %>%
  separate(Fasta_header, into = c("ID_file", "chr", "coords"), sep=":", remove = F) %>%
  mutate(contig2 = stringr::str_replace_all(Fasta_header, "[:.]", "_")) %>%
  left_join(.,Zt_meta)

#Read tree in
#details from the mafft website about the tree
#Size = 640 sequences × 319 sites
#Method = Neighbor-Joining
#Model = Jukes-Cantor
#Bootstrap resampling = 0
#Alignment id = .200930173318269H5fPcXaMbnDOkoF7E0Y3ilsfnormal


tree_file = "Tree_Avr_Stb6.nwk"
tree = as_tibble(treeio::read.tree(paste0(virulence_dir, tree_file))) %>%
  mutate(label_copy = label) %>%
  separate(label_copy, into = c("nb", "contig"), extra = "merge", remove =F) %>%
  dplyr::mutate(nb = as.integer(nb)) %>%
  dplyr::mutate(label_OG = label) %>%
  dplyr::mutate(contig2 = stringr::str_replace(contig, "R_", "")) %>%
  left_join(., temp2, by = c("contig2")) %>%
  dplyr::mutate(label_temp = ifelse(is.na(ID_file), ifelse(is.na(contig), label, contig), ID_file)) %>%
  unite(col = "label_new", nb, label_temp, sep = "_")

tree2 = as_tibble(treeio::read.tree(paste0(virulence_dir, tree_file)))

temp = tree %>%
  mutate(original_vir = ifelse(ID_file == "ST99CH_1A5", "ST99CH_1A5",
  ifelse(ID_file == "ST99CH_1E4", "ST99CH_1E4", NA))) %>%
  dplyr::select(label, node, label_new, Continent, Sampling_Date, Collection, original_vir) %>%
  filter(!is.na(label))

#Find the outgroup!
root_node = tree2 %>%
  filter(grepl("Zp13", label)) %>%
  dplyr::select(node) %>%
  pull()


rooted.tree <- ape::root(as.treedata(tree2), root_node)
p <- ggtree(rooted.tree, layout = "rectangular") %<+% (temp %>% dplyr::select (-label))

p2 <- p + geom_tippoint(aes(color = Continent), alpha = 0.7) +
  theme(legend.position = "right") +
  Color_Continent

#p2

p3 <- p + geom_tippoint(aes(color = original_vir)) +
  theme(legend.position = "right")  +
  scale_color_manual(values = c(mycolorsCorrel[1], mycolorsCorrel[20]))
cowplot::plot_grid(p2, p3)

p4 <- p + geom_tippoint(aes(color = Sampling_Date), alpha = 0.7) +
  theme(legend.position = "right")
cowplot::plot_grid(p4, p3)